#delimit;
** Setup for the daily pollution build. From the delimit line above onward, every statement and comment ends at the delimiter character rather than at the end of the line;
** REPLACE FILE PATH WITH PATH TO RELEVANT REPLICATION FILES;
local fileloc = "~/KMS_REPLICATION";
set more off;
set logtype text;
** Close any log left open under this name by an earlier run. capture suppresses the error when no such log is open;
capture log close daily_pollution;

pause on;

** The path is quoted so that a replication folder whose name contains spaces still works;
log using "`fileloc'/log_files/daily_pollution.txt", name(daily_pollution) replace;

**XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;
**XXXXXXXXXXXXXXXX IMPORT POLLUTION DATA (DAILY LEVEL) XXXXXXXXXXXXXXXXXXXX;
**XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;

clear all;

** Data downloaded from CARB website using DVD files (http://www.arb.ca.gov/aqd/aqdcd/aqdcddld.htm);

*************** START WITH CO AND O3 (TRUE DAILY) ********************;

** Variable names from dailygas_csv_headers.txt, part of DVD download;
** The file is read with the nonames option, so columns arrive with positional names (v2, v3 and so on) and are given meaningful names here;

insheet using "`fileloc'/data/emissions_data/dlygas_csv.txt", clear comma nonames;

rename v2 location;
rename v3 date;
rename v9 comax8hr;
rename v33 ozmax8hr;

keep location date comax8hr ozmax8hr;

** Convert the month-day-year text date into a Stata daily date and show it in a readable format;
gen date2 = date(date, "MDY");
drop date;
rename date2 date;
format date %td;

** Sorted by location and date because the later merge with the PM10 data uses these two variables as keys;
sort location date;

** Missing values sort above every number in Stata, so a bare comparison against the cutoff would also keep rows whose date failed to parse. Those rows are excluded explicitly;
keep if date >= td(1jan1988) & !missing(date);

save "`fileloc'/data/emissions_data/daily_CO_03.dta", replace;

************* NOW DO PM10, EVERY 6 DAYS ****************;

** Data from PM Mass Comma-delimited file (daily, hourly);

insheet using "`fileloc'/data/emissions_data/PM10StdDaily20100120.txt", clear comma names;

** Some locations have more than one monitor, often different types of measuring systems. Correlation between values is incredibly strong. To maintain consistency, we keep only the monitor labeled "1", which we consider the primary, initial monitor;

** Convert the month-day-year text date into a Stata daily date;
gen date2 = date(date, "MDY");
drop date;
rename date2 date;

** Report duplicates before and after the monitor restriction so the log shows what the restriction removed;
duplicates report site date;
keep if monitor == 1;
** Should now be no duplicates;
duplicates report site date;

** Restrict to 1988 onward. Missing values sort above every number in Stata, so rows whose date failed to parse are excluded explicitly. This filter is applied once here (the original script repeated it further down with no additional effect);
keep if date >= td(1jan1988) & !missing(date);

** There exist different sampling methods. The most popular by far is HVS09, which is short for "High Volume Sampler." After eliminating duplicates by restricting to monitor 1, almost 80% of all readings are on HVS09, with the next largest being around 4%. We keep only HVS09 measures, as other measurement systems may lead to higher or lower values based on measurement metrics, etc.;

** For a VERY SPARSE description of collection methods, see CollectionMethods20100120.txt, part of the expanded PM Mass Comma-delimited exe file from the website above;

** Keep only most common collection method;
keep if collectionmethod == "HVS09";

rename value pm10;

sort site date;

format date %td;

** Rename site to location to match CO and O3 data;
rename site location;

**************** PUTTING ALL POLLUTANTS TOGETHER *******************;

** Combine the PM10 data in memory with the saved CO and O3 file, matching on location and date;
** NOTE(review): location is converted to string first, presumably because it is stored as a string in the CO and O3 file -- confirm;
tostring location, replace;
** Old-style match merge requires both datasets to be sorted by the key variables. The data in memory is sorted here and the using file must have been saved in that order;
sort location date;
** The path is quoted so that a replication folder whose name contains spaces still works;
merge location date using "`fileloc'/data/emissions_data/daily_CO_03.dta";
** Write the match counts to the log before discarding the merge indicator;
tab _merge;
drop _merge;

*************** ADDING LOCATION DATA FOR ZIP CALCS ****************;
** Park the combined pollutant data in a temporary file so the location file can be loaded and joined against it;
sort location;
tempfile pollution;
** Temporary file paths can contain spaces on some systems, so the macro is quoted wherever it is used as a file name;
save "`pollution'";

use "`fileloc'/data/location_data/pollution_locations.dta", clear;
** By default joinby keeps only locations present in both files, so pollution records without an entry in the location file are dropped here;
joinby location using "`pollution'";

** Only the pollutant readings and their keys are kept. Any other variables from the location file are discarded;
keep location date pm10 comax8hr ozmax8hr;

** Unlikely "0" readings are ever correct (especially given surrounding values are not zeros). Likely missing values, so turn all 0s into missing here;
replace comax8hr = . if comax8hr == 0;
replace ozmax8hr = . if ozmax8hr == 0;
replace pm10 = . if pm10 == 0;

sort location date;

save "`fileloc'/data/emissions_data/daily_all_pollutants.dta", replace;

** Yearly means are written to the log as a check. The year variable is created after the save, so it is not part of the saved file;
** NOTE(review): mean appears to drop an observation when any listed variable is missing, which would make these means over days with all three pollutants present -- confirm this is intended;
gen year = year(date);
mean comax8hr ozmax8hr pm10, over(year);

log close daily_pollution;
